library(tidyverse)
library(lubridate)
library(readr)
library("ggplot2")
library("dplyr")
library(xts)
library("lubridate")
library("RColorBrewer")
library("ggthemes")
library("gridExtra")
library("leaflet")
library("highcharter")
library(scales)
library(leaflet.extras)
rats_raw <- read.csv("./Rat_Sightings.csv", na = c("", "NA", "N/A", "Unspecified")) %>%
janitor::clean_names() %>%
mutate(created_date = mdy_hms(created_date)) %>%
mutate(sighting_year = year(created_date),
sighting_month_num = month(created_date),
sighting_month = month(created_date, label = TRUE, abbr = FALSE),
sighting_day = day(created_date),
sighting_weekday = wday(created_date, label = TRUE, abbr = FALSE))
rats_raw
Overall trend in rats
overall <- rats_raw %>%
group_by(sighting_year, sighting_month_num, sighting_day) %>%
summarize(count = n()) %>%
mutate(date = as.Date(paste(sighting_year, sighting_month_num, sighting_day, sep = "-")))
## `summarise()` has grouped output by 'sighting_year',
## 'sighting_month_num'. You can override using the
## `.groups` argument.
time_series = xts(overall$count , order.by= overall$date)
hchart(time_series, name = "Rat Sightings") %>%
hc_add_theme(hc_theme_darkunica()) %>%
hc_credits(enabled = TRUE, text = "Sources: City of New York", style = list(fontSize = "12px")) %>%
hc_title(text = "Time Series of NYC Rat Sightings") %>%
hc_legend(enabled = TRUE)
There seems to be an increasing trend of rat sightings in NYC from 2009 to 2023.
Trend in rats by borough over time
borough_over_time <- rats_raw %>%
group_by(sighting_year, sighting_month_num, sighting_day, borough) %>%
summarize(count = n()) %>%
mutate(date = as.Date(paste(sighting_year, sighting_month_num, sighting_day, sep = "-"))) %>%
filter(!is.na(borough))
## `summarise()` has grouped output by 'sighting_year',
## 'sighting_month_num', 'sighting_day'. You can override
## using the `.groups` argument.
ggplot(borough_over_time, aes(x = date, y = count, color = borough)) +
geom_line() +
labs(title = "Entries Over Time by Borough",
x = "Year",
y = "Count") +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
theme_minimal()
From this graph, we can see that Brooklyn seems to have the most rat
sightings in NYC. However, we are unable to see the trend in rat
sightings in the Bronx.
borough_over_time_wo_brooklyn <- rats_raw %>%
filter(borough != "BROOKLYN") %>%
filter(borough != "MANHATTAN") %>%
group_by(sighting_year, sighting_month_num, sighting_day, borough) %>%
summarize(count = n()) %>%
mutate(date = as.Date(paste(sighting_year, sighting_month_num, sighting_day, sep = "-"))) %>%
filter(!is.na(borough))
## `summarise()` has grouped output by 'sighting_year',
## 'sighting_month_num', 'sighting_day'. You can override
## using the `.groups` argument.
borough_over_time_wo_brooklyn
ggplot(borough_over_time_wo_brooklyn, aes(x = date, y = count, color = borough)) +
geom_line() +
labs(title = "Entries Over Time by Borough",
x = "Year",
y = "Count") +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
theme_minimal()
It seems like the rat sightings in Bronx and Queens are comparable but
further EDA is needed to see where the Bronx stands in terms of rat
sightings.
Count of rats by year
by_year <- rats_raw %>%
group_by(sighting_year) %>%
count() %>%
ggplot(aes(x = sighting_year, y = n, fill = n)) +
geom_histogram(stat = "identity", position = "dodge") +
theme(legend.position ='none',axis.title = element_text(),axis.text.x = element_text(size = 12)) +
xlab("Year") +
ylab("Count") +
geom_text(aes(label = n), vjust = -0.1, size = 3.75) +
ggtitle('Count of Rat Sightings through the Years') +
scale_fill_gradientn(name = '',colours = rev(brewer.pal(10,'Spectral')))
## Warning in geom_histogram(stat = "identity", position = "dodge"): Ignoring unknown parameters: `binwidth`, `bins`, and
## `pad`
by_year
There was a huge jump in total number of rat sightings from 2020 to
2021.
Counts of rats by month
by_month <- rats_raw %>%
group_by(sighting_month) %>%
count() %>%
ggplot(aes(x = sighting_month, y = n, fill = n)) +
geom_histogram(stat = "identity", position = "dodge") +
theme(legend.position ='none',axis.title = element_text(),axis.text.x = element_text(size = 9)) +
xlab("Month") +
ylab("Count") +
geom_text(aes(label = n), vjust = -0.1, size = 3.75) +
ggtitle('Count of Rat Sightings by Month') +
scale_fill_gradientn(name = '',colours = rev(brewer.pal(10,'Spectral')))
## Warning in geom_histogram(stat = "identity", position = "dodge"): Ignoring unknown parameters: `binwidth`, `bins`, and
## `pad`
by_month
It seems like there are the most rat sightings in the warmer months with
the peak being at July.
Counts of rat by day of the week
by_day <- rats_raw %>%
group_by(sighting_weekday) %>%
count() %>%
ggplot(aes(x = sighting_weekday, y = n, fill = n)) +
geom_histogram(stat = "identity", position = "dodge") +
theme(legend.position ='none',axis.title = element_text(),axis.text.x = element_text(size = 12)) +
xlab("Weekday") +
ylab("Count") +
geom_text(aes(label = n), vjust = -0.1, size = 4) +
ggtitle('Count of Rat Sightings by Day of Week') +
scale_fill_gradientn(name = '',colours = rev(brewer.pal(10,'Spectral')))
## Warning in geom_histogram(stat = "identity", position = "dodge"): Ignoring unknown parameters: `binwidth`, `bins`, and
## `pad`
by_day
There are more rat sighings reported during the weekdays compared to
weekends.
by_location_type <- rats_raw %>%
group_by(location_type) %>%
filter(location_type != "Other (Explain Below)") %>%
count() %>%
arrange(desc(n)) %>%
head(30)
by_location_type
ggplot(by_location_type, aes(x = location_type, y = n)) +
geom_histogram(stat = "identity", position = "dodge") +
labs(title = "Sightings by Location Type",
x = "Location Type",
y = "Count") +
theme_minimal() +
coord_flip() +
geom_text(aes(label = n), hjust = -0.01, size = 3)
## Warning in geom_histogram(stat = "identity", position = "dodge"): Ignoring unknown parameters: `binwidth`, `bins`, and
## `pad`

## Overall Sightings Map and Heat Map
top = 40.917577 # north lat
left = -74.259090 # west long
right = -73.700272 # east long
bottom = 40.477399 # south lat
nyc = rats_raw %>%
filter(latitude >= bottom) %>%
filter ( latitude <= top) %>%
filter( longitude >= left ) %>%
filter(longitude <= right)
center_lon = median(nyc$longitude,na.rm = TRUE)
center_lat = median(nyc$latitude,na.rm = TRUE)
count = nyc %>%
group_by(location) %>%
count()
count
nyc = merge(nyc, count, by = "location")
factpal = colorFactor("blue", nyc$n)
nyc %>%
leaflet() %>%
addProviderTiles("Esri.NatGeoWorldMap") %>%
addCircles(lng = ~longitude, lat = ~latitude) %>%
setView(lng=center_lon, lat=center_lat,zoom = 10)
nyc %>%
leaflet() %>%
addProviderTiles("Esri.NatGeoWorldMap") %>%
addHeatmap(lng = ~longitude, lat = ~latitude, intensity = ~(nyc$n), blur = 20, max = 0.05, radius = 15) %>%
setView(lng=center_lon, lat=center_lat,zoom = 10)